// RUN: mlir-opt -test-gpu-rewrite %s | FileCheck %s

// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
// CHECK: gpu.module @kernels {
// Test module holding a single kernel with one `gpu.all_reduce add ... uniform`
// op. The directives below pin the IR that the command on the RUN line is
// expected to print for that op. They are autogenerated and order-sensitive,
// so regenerate them rather than editing them by hand.
gpu.module @kernels {

  // The expected output signature carries a `workgroup` attribution of type
  // memref<32xf32, #gpu.address_space<workgroup>> (captured as VAL_1) that the
  // input signature below does not have.
  // CHECK-LABEL: gpu.func @kernel(
  // CHECK-SAME: [[VAL_0:%.*]]: f32) workgroup([[VAL_1:%.*]] : memref<32xf32, #gpu.address_space<workgroup>>) kernel {
  gpu.func @kernel(%arg0 : f32) kernel {
    // Constants are matched order-independently: 31, 0 (i32), 0 (index), 32,
    // and the five shuffle offsets 1, 2, 4, 8, 16.
    // CHECK-DAG:   [[VAL_2:%.*]] = arith.constant 31 : i32
    // CHECK-DAG:   [[VAL_3:%.*]] = arith.constant 0 : i32
    // CHECK-DAG:   [[VAL_4:%.*]] = arith.constant 0 : index
    // CHECK-DAG:   [[VAL_5:%.*]] = arith.constant 32 : i32
    // CHECK-DAG:   [[VAL_6:%.*]] = arith.constant 1 : i32
    // CHECK-DAG:   [[VAL_7:%.*]] = arith.constant 2 : i32
    // CHECK-DAG:   [[VAL_8:%.*]] = arith.constant 4 : i32
    // CHECK-DAG:   [[VAL_9:%.*]] = arith.constant 8 : i32
    // CHECK-DAG:   [[VAL_10:%.*]] = arith.constant 16 : i32
    // Block dimensions (x, y, z) and thread ids (x, y, z), each cast from
    // index to i32.
    // CHECK:   [[VAL_11:%.*]] = gpu.block_dim x
    // CHECK:   [[VAL_12:%.*]] = arith.index_cast [[VAL_11]] : index to i32
    // CHECK:   [[VAL_13:%.*]] = gpu.block_dim y
    // CHECK:   [[VAL_14:%.*]] = arith.index_cast [[VAL_13]] : index to i32
    // CHECK:   [[VAL_15:%.*]] = gpu.block_dim z
    // CHECK:   [[VAL_16:%.*]] = arith.index_cast [[VAL_15]] : index to i32
    // CHECK:   [[VAL_17:%.*]] = gpu.thread_id x
    // CHECK:   [[VAL_18:%.*]] = arith.index_cast [[VAL_17]] : index to i32
    // CHECK:   [[VAL_19:%.*]] = gpu.thread_id y
    // CHECK:   [[VAL_20:%.*]] = arith.index_cast [[VAL_19]] : index to i32
    // CHECK:   [[VAL_21:%.*]] = gpu.thread_id z
    // CHECK:   [[VAL_22:%.*]] = arith.index_cast [[VAL_21]] : index to i32
    // VAL_27 = (tid.z * dim.y + tid.y) * dim.x + tid.x  (linearized thread id)
    // VAL_28 = dim.x * dim.y * dim.z                     (total thread count)
    // CHECK:   [[VAL_23:%.*]] = arith.muli [[VAL_22]], [[VAL_14]] : i32
    // CHECK:   [[VAL_24:%.*]] = arith.addi [[VAL_23]], [[VAL_20]] : i32
    // CHECK:   [[VAL_25:%.*]] = arith.muli [[VAL_24]], [[VAL_12]] : i32
    // CHECK:   [[VAL_26:%.*]] = arith.muli [[VAL_12]], [[VAL_14]] : i32
    // CHECK:   [[VAL_27:%.*]] = arith.addi [[VAL_25]], [[VAL_18]] : i32
    // CHECK:   [[VAL_28:%.*]] = arith.muli [[VAL_26]], [[VAL_16]] : i32
    // VAL_29 = VAL_27 & 31 (presumably the lane index within a 32-wide
    // subgroup -- confirm against the rewrite pattern); VAL_30 = (VAL_29 == 0).
    // VAL_32 = VAL_28 - (VAL_27 - VAL_29); the branch below selects the
    // guarded shuffle sequence when VAL_32 < 32.
    // CHECK:   [[VAL_29:%.*]] = arith.andi [[VAL_27]], [[VAL_2]] : i32
    // CHECK:   [[VAL_30:%.*]] = arith.cmpi eq, [[VAL_29]], [[VAL_3]] : i32
    // CHECK:   [[VAL_31:%.*]] = arith.subi [[VAL_27]], [[VAL_29]] : i32
    // CHECK:   [[VAL_32:%.*]] = arith.subi [[VAL_28]], [[VAL_31]] : i32
    // CHECK:   [[VAL_33:%.*]] = arith.cmpi slt, [[VAL_32]], [[VAL_5]] : i32
    // CHECK:   cf.cond_br [[VAL_33]], ^bb1, ^bb17
    // ^bb1..^bb16: five `gpu.shuffle xor` steps (offsets 1, 2, 4, 8, 16) using
    // VAL_32 as the width operand. Each addf is taken only when the shuffle's
    // second result is true; otherwise the running value is forwarded as is.
    // CHECK: ^bb1:
    // CHECK:   [[VAL_34:%.*]], [[VAL_35:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_32]] : f32
    // CHECK:   cf.cond_br [[VAL_35]], ^bb2, ^bb3
    // CHECK: ^bb2:
    // CHECK:   [[VAL_36:%.*]] = arith.addf [[VAL_0]], [[VAL_34]] : f32
    // CHECK:   cf.br ^bb4([[VAL_36]] : f32)
    // CHECK: ^bb3:
    // CHECK:   cf.br ^bb4([[VAL_0]] : f32)
    // CHECK: ^bb4([[VAL_37:%.*]]: f32):
    // CHECK:   [[VAL_38:%.*]], [[VAL_39:%.*]] = gpu.shuffle xor [[VAL_37]], [[VAL_7]], [[VAL_32]] : f32
    // CHECK:   cf.cond_br [[VAL_39]], ^bb5, ^bb6
    // CHECK: ^bb5:
    // CHECK:   [[VAL_40:%.*]] = arith.addf [[VAL_37]], [[VAL_38]] : f32
    // CHECK:   cf.br ^bb7([[VAL_40]] : f32)
    // CHECK: ^bb6:
    // CHECK:   cf.br ^bb7([[VAL_37]] : f32)
    // CHECK: ^bb7([[VAL_41:%.*]]: f32):
    // CHECK:   [[VAL_42:%.*]], [[VAL_43:%.*]] = gpu.shuffle xor [[VAL_41]], [[VAL_8]], [[VAL_32]] : f32
    // CHECK:   cf.cond_br [[VAL_43]], ^bb8, ^bb9
    // CHECK: ^bb8:
    // CHECK:   [[VAL_44:%.*]] = arith.addf [[VAL_41]], [[VAL_42]] : f32
    // CHECK:   cf.br ^bb10([[VAL_44]] : f32)
    // CHECK: ^bb9:
    // CHECK:   cf.br ^bb10([[VAL_41]] : f32)
    // CHECK: ^bb10([[VAL_45:%.*]]: f32):
    // CHECK:   [[VAL_46:%.*]], [[VAL_47:%.*]] = gpu.shuffle xor [[VAL_45]], [[VAL_9]], [[VAL_32]] : f32
    // CHECK:   cf.cond_br [[VAL_47]], ^bb11, ^bb12
    // CHECK: ^bb11:
    // CHECK:   [[VAL_48:%.*]] = arith.addf [[VAL_45]], [[VAL_46]] : f32
    // CHECK:   cf.br ^bb13([[VAL_48]] : f32)
    // CHECK: ^bb12:
    // CHECK:   cf.br ^bb13([[VAL_45]] : f32)
    // CHECK: ^bb13([[VAL_49:%.*]]: f32):
    // CHECK:   [[VAL_50:%.*]], [[VAL_51:%.*]] = gpu.shuffle xor [[VAL_49]], [[VAL_10]], [[VAL_32]] : f32
    // CHECK:   cf.cond_br [[VAL_51]], ^bb14, ^bb15
    // CHECK: ^bb14:
    // CHECK:   [[VAL_52:%.*]] = arith.addf [[VAL_49]], [[VAL_50]] : f32
    // CHECK:   cf.br ^bb16([[VAL_52]] : f32)
    // CHECK: ^bb15:
    // CHECK:   cf.br ^bb16([[VAL_49]] : f32)
    // CHECK: ^bb16([[VAL_53:%.*]]: f32):
    // CHECK:   cf.br ^bb18([[VAL_53]] : f32)
    // ^bb17: the same five shuffle offsets with the constant 32 as the width
    // operand; every shuffled value is added without testing the second result.
    // CHECK: ^bb17:
    // CHECK:   [[VAL_54:%.*]], [[VAL_55:%.*]] = gpu.shuffle xor [[VAL_0]], [[VAL_6]], [[VAL_5]] : f32
    // CHECK:   [[VAL_56:%.*]] = arith.addf [[VAL_0]], [[VAL_54]] : f32
    // CHECK:   [[VAL_57:%.*]], [[VAL_58:%.*]] = gpu.shuffle xor [[VAL_56]], [[VAL_7]], [[VAL_5]] : f32
    // CHECK:   [[VAL_59:%.*]] = arith.addf [[VAL_56]], [[VAL_57]] : f32
    // CHECK:   [[VAL_60:%.*]], [[VAL_61:%.*]] = gpu.shuffle xor [[VAL_59]], [[VAL_8]], [[VAL_5]] : f32
    // CHECK:   [[VAL_62:%.*]] = arith.addf [[VAL_59]], [[VAL_60]] : f32
    // CHECK:   [[VAL_63:%.*]], [[VAL_64:%.*]] = gpu.shuffle xor [[VAL_62]], [[VAL_9]], [[VAL_5]] : f32
    // CHECK:   [[VAL_65:%.*]] = arith.addf [[VAL_62]], [[VAL_63]] : f32
    // CHECK:   [[VAL_66:%.*]], [[VAL_67:%.*]] = gpu.shuffle xor [[VAL_65]], [[VAL_10]], [[VAL_5]] : f32
    // CHECK:   [[VAL_68:%.*]] = arith.addf [[VAL_65]], [[VAL_66]] : f32
    // CHECK:   cf.br ^bb18([[VAL_68]] : f32)
    // ^bb18..^bb21: when VAL_30 is true, the merged value VAL_69 is stored to
    // the workgroup buffer VAL_1 at index VAL_27 / 32; both paths then meet at
    // a gpu.barrier.
    // CHECK: ^bb18([[VAL_69:%.*]]: f32):
    // CHECK:   cf.cond_br [[VAL_30]], ^bb19, ^bb20
    // CHECK: ^bb19:
    // CHECK:   [[VAL_70:%.*]] = arith.divsi [[VAL_27]], [[VAL_5]] : i32
    // CHECK:   [[VAL_71:%.*]] = arith.index_cast [[VAL_70]] : i32 to index
    // CHECK:   store [[VAL_69]], [[VAL_1]]{{\[}}[[VAL_71]]] : memref<32xf32, #gpu.address_space<workgroup>>
    // CHECK:   cf.br ^bb21
    // CHECK: ^bb20:
    // CHECK:   cf.br ^bb21
    // CHECK: ^bb21:
    // CHECK:   gpu.barrier
    // VAL_73 = (VAL_28 + 31) / 32. Threads with VAL_27 < VAL_73 load
    // VAL_1[VAL_27] and run a second shuffle reduction over the loaded values.
    // CHECK:   [[VAL_72:%.*]] = arith.addi [[VAL_28]], [[VAL_2]] : i32
    // CHECK:   [[VAL_73:%.*]] = arith.divsi [[VAL_72]], [[VAL_5]] : i32
    // CHECK:   [[VAL_74:%.*]] = arith.cmpi slt, [[VAL_27]], [[VAL_73]] : i32
    // CHECK:   cf.cond_br [[VAL_74]], ^bb22, ^bb41
    // CHECK: ^bb22:
    // CHECK:   [[VAL_75:%.*]] = arith.index_cast [[VAL_27]] : i32 to index
    // CHECK:   [[VAL_76:%.*]] = memref.load [[VAL_1]]{{\[}}[[VAL_75]]] : memref<32xf32, #gpu.address_space<workgroup>>
    // CHECK:   [[VAL_77:%.*]] = arith.cmpi slt, [[VAL_73]], [[VAL_5]] : i32
    // CHECK:   cf.cond_br [[VAL_77]], ^bb23, ^bb39
    // ^bb23..^bb38: guarded shuffle sequence (same shape as ^bb1..^bb16) with
    // VAL_73 as the width operand, taken when VAL_73 < 32.
    // CHECK: ^bb23:
    // CHECK:   [[VAL_78:%.*]], [[VAL_79:%.*]] = gpu.shuffle xor [[VAL_76]], [[VAL_6]], [[VAL_73]] : f32
    // CHECK:   cf.cond_br [[VAL_79]], ^bb24, ^bb25
    // CHECK: ^bb24:
    // CHECK:   [[VAL_80:%.*]] = arith.addf [[VAL_76]], [[VAL_78]] : f32
    // CHECK:   cf.br ^bb26([[VAL_80]] : f32)
    // CHECK: ^bb25:
    // CHECK:   cf.br ^bb26([[VAL_76]] : f32)
    // CHECK: ^bb26([[VAL_81:%.*]]: f32):
    // CHECK:   [[VAL_82:%.*]], [[VAL_83:%.*]] = gpu.shuffle xor [[VAL_81]], [[VAL_7]], [[VAL_73]] : f32
    // CHECK:   cf.cond_br [[VAL_83]], ^bb27, ^bb28
    // CHECK: ^bb27:
    // CHECK:   [[VAL_84:%.*]] = arith.addf [[VAL_81]], [[VAL_82]] : f32
    // CHECK:   cf.br ^bb29([[VAL_84]] : f32)
    // CHECK: ^bb28:
    // CHECK:   cf.br ^bb29([[VAL_81]] : f32)
    // CHECK: ^bb29([[VAL_85:%.*]]: f32):
    // CHECK:   [[VAL_86:%.*]], [[VAL_87:%.*]] = gpu.shuffle xor [[VAL_85]], [[VAL_8]], [[VAL_73]] : f32
    // CHECK:   cf.cond_br [[VAL_87]], ^bb30, ^bb31
    // CHECK: ^bb30:
    // CHECK:   [[VAL_88:%.*]] = arith.addf [[VAL_85]], [[VAL_86]] : f32
    // CHECK:   cf.br ^bb32([[VAL_88]] : f32)
    // CHECK: ^bb31:
    // CHECK:   cf.br ^bb32([[VAL_85]] : f32)
    // CHECK: ^bb32([[VAL_89:%.*]]: f32):
    // CHECK:   [[VAL_90:%.*]], [[VAL_91:%.*]] = gpu.shuffle xor [[VAL_89]], [[VAL_9]], [[VAL_73]] : f32
    // CHECK:   cf.cond_br [[VAL_91]], ^bb33, ^bb34
    // CHECK: ^bb33:
    // CHECK:   [[VAL_92:%.*]] = arith.addf [[VAL_89]], [[VAL_90]] : f32
    // CHECK:   cf.br ^bb35([[VAL_92]] : f32)
    // CHECK: ^bb34:
    // CHECK:   cf.br ^bb35([[VAL_89]] : f32)
    // CHECK: ^bb35([[VAL_93:%.*]]: f32):
    // CHECK:   [[VAL_94:%.*]], [[VAL_95:%.*]] = gpu.shuffle xor [[VAL_93]], [[VAL_10]], [[VAL_73]] : f32
    // CHECK:   cf.cond_br [[VAL_95]], ^bb36, ^bb37
    // CHECK: ^bb36:
    // CHECK:   [[VAL_96:%.*]] = arith.addf [[VAL_93]], [[VAL_94]] : f32
    // CHECK:   cf.br ^bb38([[VAL_96]] : f32)
    // CHECK: ^bb37:
    // CHECK:   cf.br ^bb38([[VAL_93]] : f32)
    // CHECK: ^bb38([[VAL_97:%.*]]: f32):
    // CHECK:   cf.br ^bb40([[VAL_97]] : f32)
    // ^bb39: unguarded shuffle sequence (same shape as ^bb17) with the
    // constant 32 as the width operand.
    // CHECK: ^bb39:
    // CHECK:   [[VAL_98:%.*]], [[VAL_99:%.*]] = gpu.shuffle xor [[VAL_76]], [[VAL_6]], [[VAL_5]] : f32
    // CHECK:   [[VAL_100:%.*]] = arith.addf [[VAL_76]], [[VAL_98]] : f32
    // CHECK:   [[VAL_101:%.*]], [[VAL_102:%.*]] = gpu.shuffle xor [[VAL_100]], [[VAL_7]], [[VAL_5]] : f32
    // CHECK:   [[VAL_103:%.*]] = arith.addf [[VAL_100]], [[VAL_101]] : f32
    // CHECK:   [[VAL_104:%.*]], [[VAL_105:%.*]] = gpu.shuffle xor [[VAL_103]], [[VAL_8]], [[VAL_5]] : f32
    // CHECK:   [[VAL_106:%.*]] = arith.addf [[VAL_103]], [[VAL_104]] : f32
    // CHECK:   [[VAL_107:%.*]], [[VAL_108:%.*]] = gpu.shuffle xor [[VAL_106]], [[VAL_9]], [[VAL_5]] : f32
    // CHECK:   [[VAL_109:%.*]] = arith.addf [[VAL_106]], [[VAL_107]] : f32
    // CHECK:   [[VAL_110:%.*]], [[VAL_111:%.*]] = gpu.shuffle xor [[VAL_109]], [[VAL_10]], [[VAL_5]] : f32
    // CHECK:   [[VAL_112:%.*]] = arith.addf [[VAL_109]], [[VAL_110]] : f32
    // CHECK:   cf.br ^bb40([[VAL_112]] : f32)
    // ^bb40..^bb42: the second-stage result is stored to element 0 (index
    // constant VAL_4) of the workgroup buffer, followed by a final gpu.barrier.
    // CHECK: ^bb40([[VAL_113:%.*]]: f32):
    // CHECK:   store [[VAL_113]], [[VAL_1]]{{\[}}[[VAL_4]]] : memref<32xf32, #gpu.address_space<workgroup>>
    // CHECK:   cf.br ^bb42
    // CHECK: ^bb41:
    // CHECK:   cf.br ^bb42
    // CHECK: ^bb42:
    // CHECK:   gpu.barrier
    // Input under test: an `add` all-reduce of %arg0 written with the `uniform`
    // keyword and an empty `{}` region. %sum has no uses in this kernel, and no
    // directive above matches a read-back of the stored result after the final
    // barrier.
    %sum = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32)
    gpu.return
  }

}
